import pandas as pd
import numpy as np
from scipy.io import loadmat
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import tensorflow as tf
from sklearn import svm
from sklearn.model_selection import train_test_split
pd.set_option('mode.chained_assignment',None)
from sklearn.decomposition import PCA
# Load the K-means example dataset and take a first look at it as a scatter
# plot. The 'X' entry of the .mat file is labelled with two feature columns.
data = loadmat('ex7/ex7data2.mat')
X = data['X']
df = pd.DataFrame(X, columns=['x1', 'x2'])
df.plot.scatter(x='x1', y='x2')
# Output: <AxesSubplot:xlabel='x1', ylabel='x2'>
def init_clusters(data, centroids):
    """Return random starting centroids drawn inside the data's bounding box.

    Args:
        data: DataFrame whose columns are the numeric features.
        centroids: Number of centroids to generate.

    Returns:
        Dict mapping centroid id (``0 .. centroids - 1``) to a tuple holding
        one coordinate per column of ``data``. Each coordinate is sampled
        uniformly between that column's minimum and maximum.
    """
    # Take the per-column bounds as plain float arrays so the draw does not
    # depend on the column labels or on the columns' (possibly integer) dtype.
    low = data.min().to_numpy(dtype=float)
    high = data.max().to_numpy(dtype=float)
    # One vectorised draw per centroid yields a coordinate for every column.
    return {
        i: tuple(np.random.uniform(low=low, high=high))
        for i in range(centroids)
    }
def k_means(input_df, k=3, iterations=10):
    """Cluster the rows of ``input_df`` with the K-means algorithm.

    Args:
        input_df: DataFrame with one feature per column. It is copied, so the
            caller's frame is not modified.
        k: Number of clusters.
        iterations: Number of assign/update rounds. All of them are run;
            there is no early stop.

    Returns:
        A 4-tuple ``(data, centroids, centroid_history, int_clusters)``:

        * ``data``: copy of the input with two added columns, ``distance``
          (squared Euclidean distance to the assigned centroid) and
          ``cluster`` (id of that centroid).
        * ``centroids``: DataFrame with one row per centroid and the same
          columns as ``data``; ``distance`` is 0 and ``cluster`` is the label
          ``"<id>_centroid"``.
        * ``centroid_history``: dict mapping each centroid id to the list of
          positions it held at the start of every iteration.
        * ``int_clusters``: dict of the initial centroid positions.
    """
    data = input_df.copy()  # do not alter initial df
    columns = data.columns
    centroid_history = {i: [] for i in range(k)}
    cluster_centroids = init_clusters(data, k)
    int_clusters = cluster_centroids.copy()

    for _ in range(iterations):
        # Assignment step: reset, then keep the nearest centroid for each row.
        data['distance'] = np.inf
        data['cluster'] = np.nan
        for centroid, features in cluster_centroids.items():
            centroid_history[centroid].append(features)
            new_distance = ((data[columns] - features) ** 2).sum(axis=1)
            assign_mask = new_distance < data['distance']
            # Write through .loc: chained indexing (data['distance'][mask] = ...)
            # may assign into a temporary and does nothing under Copy-on-Write.
            data.loc[assign_mask, 'distance'] = new_distance[assign_mask]
            data.loc[assign_mask, 'cluster'] = centroid

        # Update step: move every centroid to the mean of its members.
        for centroid in cluster_centroids:
            members = data.loc[data['cluster'] == centroid, columns]
            if not members.empty:
                cluster_centroids[centroid] = tuple(members.mean())
            else:
                # A centroid that attracted no rows is re-seeded at random.
                cluster_centroids[centroid] = init_clusters(data[columns], 1)[0]

    # One row per centroid, shaped like `data` so both frames can be stacked.
    centroids = pd.DataFrame(cluster_centroids).T
    centroids['distance'] = 0
    centroids['cluster'] = centroids.index.astype(str) + '_centroid'
    centroids.columns = data.columns
    return data, centroids, centroid_history, int_clusters
# Cluster the 2-D dataset, then plot the points coloured by cluster together
# with the final centroids and the path each centroid travelled.
data, centroids, h, init = k_means(df, iterations=100)
plot_data = pd.concat([data, centroids])
fig = px.scatter(plot_data, x='x1', y='x2', color='cluster')
fig.update_layout(autosize=False)
# Draw one path per centroid found in the history, whatever k was used,
# instead of repeating the trace once per hard-coded centroid id.
for centroid_id, path in h.items():
    fig.add_trace(go.Scatter(
        x=[point[0] for point in path],
        y=[point[1] for point in path],
        mode='lines',
        line_color='black',
        name=f'Centroid {centroid_id} path',
    ))
fig  # trailing expression so a notebook cell still renders the figure
# Read the picture to compress; its array shape is kept so the flattened,
# quantised pixels can be reshaped back into an image later.
image = Image.open('ex7/bird_small.png')
image_array = np.asarray(image)
data_shape = np.shape(image_array)
plt.imshow(image_array)
# Output: <matplotlib.image.AxesImage at 0x1ba15158af0>
# Treat every pixel as one sample whose features are the values along the
# image array's last axis, and cluster the pixels into 16 groups.
image_flatten = image_array.reshape(-1, image_array.shape[-1])
im_df = pd.DataFrame(image_flatten)
data, clusters, hist, init = k_means(im_df, k=16, iterations=100)
plot_data = pd.concat([data, clusters])
fig = px.scatter(plot_data, x=0, y=2, color='cluster')
fig.update_layout(autosize=False)
# Replace each pixel by its centroid. Every column name overlaps between the
# two frames, so merge() appends the suffixes: the pixel columns get '_x' and
# the centroid columns get '', which turns their integer labels into strings.
new_df = data.merge(
    clusters,
    how='left',
    left_on='cluster',
    right_index=True,
    suffixes=('_x', ''),
)
# Derive the centroid channel columns from the pixel frame instead of
# hard-coding three of them, so the reshape matches any channel count.
channel_columns = [str(column) for column in im_df.columns]
new_image_array = (
    new_df[channel_columns].round().astype(int).to_numpy().reshape(data_shape)
)
# Display the unmodified pixel array for comparison with the compressed one.
plt.imshow(image_array)
# Original image
# Output: <matplotlib.image.AxesImage at 0x1ba152e3e50>
# Display the image rebuilt from the rounded centroid colours.
plt.imshow(new_image_array)
# New image
# Output: <matplotlib.image.AxesImage at 0x1ba154333a0>
# Load the PCA example dataset, plot it, and report how strongly the two
# variables are correlated.
data = loadmat('ex7/ex7data1.mat')
df = pd.DataFrame(data['X'])
df.columns = ['x', 'y']
fig1 = px.scatter(df, x='x', y='y')
fig1.show()
print(f"Correlation of variables: {df['x'].corr(df['y'])}")
# Output: Correlation of variables: 0.7355303763393295
# Fit a two-component PCA on the dataset and print the share of variance
# captured by each component.
pca = PCA(n_components=2).fit(df)
print(f"Variance explained: {pca.explained_variance_ratio_}")
# Output: Variance explained: [0.87062385 0.12937615]
# Build one line segment per principal component, each starting at the data
# mean, and overlay the segments on the scatter plot.
l = []
for ratio, vector, length in zip(pca.explained_variance_ratio_, pca.components_, pca.mean_):
    # NOTE(review): `length` is a coordinate of the data mean, so the drawn
    # length depends on where the data sits; confirm this scale is intended.
    segment_end = pca.mean_ + ratio * vector * length
    l.append(pd.DataFrame([pca.mean_, segment_end], columns=['x', 'y']))
eigen_vectors = pd.concat(l)
fig1.add_trace(
    go.Scatter(
        x=eigen_vectors['x'],
        y=eigen_vectors['y'],
        mode='lines',
        line_color='black',
        name='new coordinates',
    )
)
# Fix the figure size and tie the y axis scale to the x axis (ratio 1).
fig1.update_layout(width=800, height=500)
fig1.update_yaxes(scaleanchor="x", scaleratio=1)
# Express the samples in the fitted PCA's component coordinates, add the
# fitted mean back, and overlay the result on the same figure as markers.
projected = pca.transform(df)
transformed_df_mean_adjusted = pd.DataFrame(projected, columns=['x', 'y']) + pca.mean_
fig1.add_trace(
    go.Scatter(
        x=transformed_df_mean_adjusted['x'],
        y=transformed_df_mean_adjusted['y'],
        mode='markers',
        line_color='green',
        name='new features',
    )
)